#Installing Packages:

#install.packages("GGally")
#install.packages("caret")
#install.packages("mgcv")
#install.packages("glmnet")
#install.packages("randomForest")
#install.packages("mice")
#install.packages("rmarkdown")
#install.packages("knitr")
#Import the libraries:

library(ggplot2)
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(readxl) # Load readxl package
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

#Data Preparation:

#Load the data:

# NOTE(review): hard-coded absolute path to one user's home directory --
# consider a relative path or a project-level config so the script is portable.
df2 <- read_excel("/Users/hetal/Downloads/RedactedClientConstituent_File.xlsx")
head(df2)
# Check for missing values: count of NAs in every column.
# Gift-date fields share 27,073 NAs (constituents who never gave).
print(colSums(is.na(df2)))
##                      CnBio_ID               First Gift Date 
##                             0                         27073 
##                Last Gift Date             Largest Gift Date 
##                         27073                         27073 
##               CnBio_DateAdded             CnBio_DateChanged 
##                             0                             0 
##           CnBio_Key_Indicator                CnBio_Deceased 
##                             0                             0 
##                 CnBio_Title_1          CnBio_Marital_status 
##                         29027                         24985 
##                          City                         State 
##                          1267                          1374 
##             Reunions attended                           Zip 
##                             0                          1372 
##                 CnAdrPrf_Type       CnCnstncy_1_01_CodeLong 
##                             0                          4882 
##       CnCnstncy_1_02_CodeLong       CnCnstncy_1_03_CodeLong 
##                         37625                         42088 
##       CnCnstncy_1_04_CodeLong       CnSpPrBs_RecordImportID 
##                         42280                         42162 
##        CnRelEdu_1_01_Class_of          CnRelEdu_1_01_Degree 
##                         25089                         20635 
##          CnRelEdu_1_02_Degree           Alumni Board Member 
##                         39900                         42259 
##            Married to an Alum            Personal Email End 
##                         41713                         18106 
##         Total Lifetime Giving          Last 10 Years Giving 
##                             0                             0 
## Last 5 Years Giving (FY17-21)                  CnSpSpBio_ID 
##                             0                         39885 
##                        Alumni 
##                             0
#Converting the columns for easy accessibility:

# Map each original (spaced) column name to a syntactic underscore name so
# columns can be referenced with `$` and in formulas without backticks.
# BUG FIX: the original code renamed "Last Gift Date" twice (a no-op the
# second time); the duplicate has been removed.
rename_map <- c(
  "Total Lifetime Giving"         = "Total_Lifetime_Giving",
  "Last 5 Years Giving (FY17-21)" = "Last_5_Years_Giving",
  "Last 10 Years Giving"          = "Last_10_Years_Giving",
  "Alumni Board Member"           = "Alumni_Board_Member",
  "Last Gift Date"                = "Last_Gift_Date",
  "First Gift Date"               = "First_Gift_Date",
  "Largest Gift Date"             = "Largest_Gift_Date",
  "Personal Email End"            = "Personal_Email_End",
  "Reunions attended"             = "Reunions_attended",
  "Married to an Alum"            = "Married_to_an_Alum"
)
for (old_name in names(rename_map)) {
  colnames(df2)[colnames(df2) == old_name] <- rename_map[[old_name]]
}
##Shape of DataFrame:
# dim() returns c(rows, columns): 42,287 constituents x 31 variables.
dim(df2)
## [1] 42287    31
#Finding the datatype of variables:
# Class of every column; date columns are POSIXct, codes/flags are character.
sapply(df2, class)
## $CnBio_ID
## [1] "numeric"
## 
## $First_Gift_Date
## [1] "POSIXct" "POSIXt" 
## 
## $Last_Gift_Date
## [1] "POSIXct" "POSIXt" 
## 
## $Largest_Gift_Date
## [1] "POSIXct" "POSIXt" 
## 
## $CnBio_DateAdded
## [1] "POSIXct" "POSIXt" 
## 
## $CnBio_DateChanged
## [1] "POSIXct" "POSIXt" 
## 
## $CnBio_Key_Indicator
## [1] "character"
## 
## $CnBio_Deceased
## [1] "character"
## 
## $CnBio_Title_1
## [1] "character"
## 
## $CnBio_Marital_status
## [1] "character"
## 
## $City
## [1] "character"
## 
## $State
## [1] "character"
## 
## $Reunions_attended
## [1] "numeric"
## 
## $Zip
## [1] "character"
## 
## $CnAdrPrf_Type
## [1] "character"
## 
## $CnCnstncy_1_01_CodeLong
## [1] "character"
## 
## $CnCnstncy_1_02_CodeLong
## [1] "character"
## 
## $CnCnstncy_1_03_CodeLong
## [1] "character"
## 
## $CnCnstncy_1_04_CodeLong
## [1] "character"
## 
## $CnSpPrBs_RecordImportID
## [1] "character"
## 
## $CnRelEdu_1_01_Class_of
## [1] "numeric"
## 
## $CnRelEdu_1_01_Degree
## [1] "character"
## 
## $CnRelEdu_1_02_Degree
## [1] "character"
## 
## $Alumni_Board_Member
## [1] "character"
## 
## $Married_to_an_Alum
## [1] "character"
## 
## $Personal_Email_End
## [1] "character"
## 
## $Total_Lifetime_Giving
## [1] "numeric"
## 
## $Last_10_Years_Giving
## [1] "numeric"
## 
## $Last_5_Years_Giving
## [1] "numeric"
## 
## $CnSpSpBio_ID
## [1] "numeric"
## 
## $Alumni
## [1] "numeric"
#Finding the summary of dataframe:
# Quartile/mean summaries for numeric and date columns; character columns
# only report length and storage mode. Note the heavy right skew of the
# giving columns (median 0, max > $11M) -- relevant for later cleaning steps.
summary(df2)
##     CnBio_ID         First_Gift_Date                  
##  Min.   :        4   Min.   :1901-01-01 00:00:00.000  
##  1st Qu.:   173994   1st Qu.:1989-10-24 00:00:00.000  
##  Median :   292194   Median :2003-02-17 00:00:00.000  
##  Mean   : 24034109   Mean   :1998-11-12 07:39:31.447  
##  3rd Qu.:   435660   3rd Qu.:2010-10-18 00:00:00.000  
##  Max.   :200003047   Max.   :2022-03-14 00:00:00.000  
##                      NA's   :27073                    
##  Last_Gift_Date                   Largest_Gift_Date               
##  Min.   :1901-01-01 00:00:00.00   Min.   :1901-01-01 00:00:00.00  
##  1st Qu.:1998-03-19 00:00:00.00   1st Qu.:1995-07-31 00:00:00.00  
##  Median :2008-12-18 00:00:00.00   Median :2006-12-07 00:00:00.00  
##  Mean   :2006-04-01 19:16:48.49   Mean   :2003-03-17 06:09:53.45  
##  3rd Qu.:2016-02-29 00:00:00.00   3rd Qu.:2013-04-30 00:00:00.00  
##  Max.   :2022-03-15 00:00:00.00   Max.   :2022-03-14 00:00:00.00  
##  NA's   :27073                    NA's   :27073                   
##  CnBio_DateAdded                  CnBio_DateChanged               
##  Min.   :2017-07-17 00:00:00.00   Min.   :2017-07-18 00:00:00.00  
##  1st Qu.:2017-07-17 00:00:00.00   1st Qu.:2021-11-27 00:00:00.00  
##  Median :2017-07-17 00:00:00.00   Median :2022-02-07 00:00:00.00  
##  Mean   :2017-10-04 02:19:28.86   Mean   :2021-08-05 14:07:51.11  
##  3rd Qu.:2017-07-17 00:00:00.00   3rd Qu.:2022-02-11 00:00:00.00  
##  Max.   :2022-03-17 00:00:00.00   Max.   :2022-03-18 00:00:00.00  
##                                                                   
##  CnBio_Key_Indicator CnBio_Deceased     CnBio_Title_1      CnBio_Marital_status
##  Length:42287        Length:42287       Length:42287       Length:42287        
##  Class :character    Class :character   Class :character   Class :character    
##  Mode  :character    Mode  :character   Mode  :character   Mode  :character    
##                                                                                
##                                                                                
##                                                                                
##                                                                                
##      City              State           Reunions_attended      Zip           
##  Length:42287       Length:42287       Min.   :0.000000   Length:42287      
##  Class :character   Class :character   1st Qu.:0.000000   Class :character  
##  Mode  :character   Mode  :character   Median :0.000000   Mode  :character  
##                                        Mean   :0.007378                     
##                                        3rd Qu.:0.000000                     
##                                        Max.   :6.000000                     
##                                                                             
##  CnAdrPrf_Type      CnCnstncy_1_01_CodeLong CnCnstncy_1_02_CodeLong
##  Length:42287       Length:42287            Length:42287           
##  Class :character   Class :character        Class :character       
##  Mode  :character   Mode  :character        Mode  :character       
##                                                                    
##                                                                    
##                                                                    
##                                                                    
##  CnCnstncy_1_03_CodeLong CnCnstncy_1_04_CodeLong CnSpPrBs_RecordImportID
##  Length:42287            Length:42287            Length:42287           
##  Class :character        Class :character        Class :character       
##  Mode  :character        Mode  :character        Mode  :character       
##                                                                         
##                                                                         
##                                                                         
##                                                                         
##  CnRelEdu_1_01_Class_of CnRelEdu_1_01_Degree CnRelEdu_1_02_Degree
##  Min.   :1929           Length:42287         Length:42287        
##  1st Qu.:1985           Class :character     Class :character    
##  Median :2004           Mode  :character     Mode  :character    
##  Mean   :1997                                                    
##  3rd Qu.:2014                                                    
##  Max.   :2021                                                    
##  NA's   :25089                                                   
##  Alumni_Board_Member Married_to_an_Alum Personal_Email_End
##  Length:42287        Length:42287       Length:42287      
##  Class :character    Class :character   Class :character  
##  Mode  :character    Mode  :character   Mode  :character  
##                                                           
##                                                           
##                                                           
##                                                           
##  Total_Lifetime_Giving Last_10_Years_Giving Last_5_Years_Giving
##  Min.   :       0      Min.   :      0      Min.   :      0    
##  1st Qu.:       0      1st Qu.:      0      1st Qu.:      0    
##  Median :       0      Median :      0      Median :      0    
##  Mean   :    2318      Mean   :    989      Mean   :    456    
##  3rd Qu.:      35      3rd Qu.:      0      3rd Qu.:      0    
##  Max.   :11321975      Max.   :8805800      Max.   :4121000    
##                                                                
##   CnSpSpBio_ID           Alumni      
##  Min.   :     1512   Min.   :0.0000  
##  1st Qu.:   199564   1st Qu.:0.0000  
##  Median :   357472   Median :0.0000  
##  Mean   : 68234755   Mean   :0.4386  
##  3rd Qu.:200000817   3rd Qu.:1.0000  
##  Max.   :200003047   Max.   :1.0000  
##  NA's   :39885
#Getting the column names:
names(df2)
##  [1] "CnBio_ID"                "First_Gift_Date"        
##  [3] "Last_Gift_Date"          "Largest_Gift_Date"      
##  [5] "CnBio_DateAdded"         "CnBio_DateChanged"      
##  [7] "CnBio_Key_Indicator"     "CnBio_Deceased"         
##  [9] "CnBio_Title_1"           "CnBio_Marital_status"   
## [11] "City"                    "State"                  
## [13] "Reunions_attended"       "Zip"                    
## [15] "CnAdrPrf_Type"           "CnCnstncy_1_01_CodeLong"
## [17] "CnCnstncy_1_02_CodeLong" "CnCnstncy_1_03_CodeLong"
## [19] "CnCnstncy_1_04_CodeLong" "CnSpPrBs_RecordImportID"
## [21] "CnRelEdu_1_01_Class_of"  "CnRelEdu_1_01_Degree"   
## [23] "CnRelEdu_1_02_Degree"    "Alumni_Board_Member"    
## [25] "Married_to_an_Alum"      "Personal_Email_End"     
## [27] "Total_Lifetime_Giving"   "Last_10_Years_Giving"   
## [29] "Last_5_Years_Giving"     "CnSpSpBio_ID"           
## [31] "Alumni"

#Feature Engineering:

#After analyzing the data, group all the similar features in same category using encoding method:

# Collapse CnCnstncy_1_01_CodeLong into broad donor categories via a lookup
# table. Codes absent from the table -- and missing values -- fall through to
# the literal string "NA", matching the previous nested-ifelse behaviour.
code01_groups <- c(
  "Board"                               = "Board",
  "Previous Board"                      = "Board",
  "Current Fac/Staff"                   = "Fac/Staff",
  "Former Fac/Staff"                    = "Fac/Staff",
  "Student"                             = "Education/Family",
  "Parent"                              = "Education/Family",
  "Education Certificate"               = "Education/Family",
  "Friend"                              = "Friendship/Memorial",
  "Friends / Memorial"                  = "Friendship/Memorial",
  "Prospective Benefactor"              = "OtherDonors",
  "Organization"                        = "OtherDonors",
  "Alumni"                              = "OtherDonors",
  "Business"                            = "OtherDonors",
  "Foundation"                          = "OtherDonors",
  "Trust / Business"                    = "OtherDonors",
  "Dominican Colleges and Universities" = "OtherDonors",
  "WAICU"                               = "OtherDonors",
  "Government"                          = "OtherDonors",
  "Religious Org"                       = "OtherDonors",
  "Unknown - Historical"                = "OtherDonors"
)
grouped01 <- unname(code01_groups[df2$CnCnstncy_1_01_CodeLong])
grouped01[is.na(grouped01)] <- "NA"
df2$grouped_CnCnstncy_1_01_CodeLong <- grouped01

# Integer-encode the categories (Education/Family = 1 ... OtherDonors = 5);
# the "NA" bucket is coded as 0.
levels01 <- c("Education/Family", "Friendship/Memorial", "Fac/Staff", "Board", "OtherDonors")
encoded01 <- match(grouped01, levels01)
encoded01[is.na(encoded01)] <- 0
df2$CnCnstncy_1_01_CodeLong_Encoded <- encoded01
#After analyzing the data, group all the similar features in same category using encoding method:

# Collapse CnCnstncy_1_02_CodeLong into the same broad categories via a
# lookup table; unmapped codes and missing values fall through to the
# literal string "NA", matching the previous nested-ifelse behaviour.
code02_groups <- c(
  "Board"                            = "Board",
  "Previous Board"                   = "Board",
  "Board of Visitors Advisory Group" = "Board",
  "Current Fac/Staff"                = "Fac/Staff",
  "Former Fac/Staff"                 = "Fac/Staff",
  "Student"                          = "Education/Family",
  "Parent"                           = "Education/Family",
  "Education Certificate"            = "Education/Family",
  "Friend"                           = "Friendship/Memorial",
  "Friends / Memorial"               = "Friendship/Memorial",
  "Friends / Athletics"              = "Friendship/Memorial",
  "Friends / Agency"                 = "Friendship/Memorial",
  "Prospective Benefactor"           = "OtherDonors",
  "Organization"                     = "OtherDonors",
  "Alumni"                           = "OtherDonors",
  "Business"                         = "OtherDonors",
  "Foundation"                       = "OtherDonors",
  "Trust"                            = "OtherDonors",
  "Cutting Edge Alumni"              = "OtherDonors",
  "Religious Org"                    = "OtherDonors",
  "Unknown - Historical"             = "OtherDonors"
)
grouped02 <- unname(code02_groups[df2$CnCnstncy_1_02_CodeLong])
grouped02[is.na(grouped02)] <- "NA"
df2$grouped_CnCnstncy_1_02_CodeLong <- grouped02

# Integer-encode the categories (1-5); the "NA" bucket is coded as 0.
levels02 <- c("Education/Family", "Friendship/Memorial", "Fac/Staff", "Board", "OtherDonors")
encoded02 <- match(grouped02, levels02)
encoded02[is.na(encoded02)] <- 0
df2$CnCnstncy_1_02_CodeLong_Encoded <- encoded02
#After analyzing the data, group all the similar features in same category using encoding method:

# Collapse CnCnstncy_1_03_CodeLong into the same broad categories via a
# lookup table; unmapped codes and missing values fall through to the
# literal string "NA", matching the previous nested-ifelse behaviour.
code03_groups <- c(
  "Board"                  = "Board",
  "Previous Board"         = "Board",
  "Current Fac/Staff"      = "Fac/Staff",
  "Former Fac/Staff"       = "Fac/Staff",
  "Student"                = "Education/Family",
  "Parent"                 = "Education/Family",
  "Education Certificate"  = "Education/Family",
  "Friend"                 = "Friendship/Memorial",
  "Friends / Memorial"     = "Friendship/Memorial",
  "Prospective Benefactor" = "OtherDonors",
  "Organization"           = "OtherDonors",
  "Alumni"                 = "OtherDonors"
)
grouped03 <- unname(code03_groups[df2$CnCnstncy_1_03_CodeLong])
grouped03[is.na(grouped03)] <- "NA"
df2$grouped_CnCnstncy_1_03_CodeLong <- grouped03

# Integer-encode the categories (1-5); the "NA" bucket is coded as 0.
levels03 <- c("Education/Family", "Friendship/Memorial", "Fac/Staff", "Board", "OtherDonors")
encoded03 <- match(grouped03, levels03)
encoded03[is.na(encoded03)] <- 0
df2$CnCnstncy_1_03_CodeLong_Encoded <- encoded03
#After analyzing the data, group all the similar features in same category using encoding method:

# Create a mapping between the original labels and new encoded labels for CnCnstncy_1_04_CodeLong:
label_mapping <- c("NA" = 0, "Former Fac/Staff" = 1, "Previous Board" = 2, "Education Certificate" = 3)

# Replace missing values with "NA" label
df2$CnCnstncy_1_04_CodeLong[is.na(df2$CnCnstncy_1_04_CodeLong)] <- "NA"

# Convert the column to a factor with the new encoding.
# BUG FIX: the original call `factor(x, labels = label_mapping)` ignored the
# NAMES of `label_mapping` and paired the labels 0..3 with the factor's
# default (alphabetically sorted) levels, so "Education Certificate" was
# encoded as 0 and "NA" as 2 -- not the intended mapping. Supplying
# `levels = names(label_mapping)` aligns each original label with its
# intended code. The result is still a factor, as before.
df2$CnCnstncy_1_04_CodeLong_Encoded <- factor(df2$CnCnstncy_1_04_CodeLong,
                                              levels = names(label_mapping),
                                              labels = label_mapping)
#After analyzing the data, group all the similar features in same category using encoding method:

# Collapse marital-status codes into four buckets via a lookup table.
# Missing values and explicit "Unknown" both map to "Unknowns"; any other
# unrecognized value falls through to the literal string "NA", matching the
# previous nested-ifelse behaviour.
marital_groups <- c(
  "Married"      = "Committed",
  "Partner"      = "Committed",
  "Cohabitation" = "Committed",
  "Engaged"      = "Committed",
  "Divorced"     = "Single",
  "Single"       = "Single",
  "Widowed"      = "Single",
  "Separated"    = "Single",
  "Religious"    = "Worshipper",
  "Unknown"      = "Unknowns"
)
grouped_marital <- unname(marital_groups[df2$CnBio_Marital_status])
grouped_marital[is.na(df2$CnBio_Marital_status)] <- "Unknowns"
grouped_marital[is.na(grouped_marital)] <- "NA"
df2$grouped_CnBio_Marital_status <- grouped_marital

# Integer-encode the buckets (Committed = 1, Single = 2, Worshipper = 3);
# both the "NA" and "Unknowns" buckets collapse to 0, as before.
marital_levels <- c("Committed", "Single", "Worshipper", "Unknowns")
encoded_marital <- match(grouped_marital, marital_levels)
encoded_marital[grouped_marital %in% c("NA", "Unknowns")] <- 0
df2$CnBio_Marital_status_Encoded <- encoded_marital
##Label Encoding to States Column:

## TODO: find the top 5 and bottom 5 states by total lifetime, 5-year and
## 10-year giving and check whether the rankings are consistent.

# Encode State as a factor and use the underlying level indices as the
# integer code; rows with a missing State get code 0.
df2$State <- factor(df2$State)
state_codes <- as.integer(df2$State)
state_codes[is.na(state_codes)] <- 0
df2$State_encoded <- state_codes

# View the resulting data frame
df2
# Flag alumni board membership: any non-missing, non-empty entry becomes 1,
# everything else becomes 0.
df2$Alumni_Board_Member <- ifelse(!is.na(df2$Alumni_Board_Member) & df2$Alumni_Board_Member != "", 1, 0)
# Binary-encode CnBio_Deceased: "No" -> 0, any other value -> 1.
# (The earlier colSums() check showed this column has no NAs, so the
# NA-propagating ifelse() is safe here.)
df2$CnBio_Deceased <- ifelse(df2$CnBio_Deceased == "No", 0, 1)

# print updated data frame
print(df2)
## # A tibble: 42,287 × 41
##     CnBio_ID First_Gift_Date Last_Gift_Date Largest_Gift_Date
##        <dbl> <dttm>          <dttm>         <dttm>           
##  1 200001488 NA              NA             NA               
##  2 200001489 NA              NA             NA               
##  3 200001490 NA              NA             NA               
##  4 200001491 NA              NA             NA               
##  5 200001492 NA              NA             NA               
##  6 200001493 NA              NA             NA               
##  7 200001494 NA              NA             NA               
##  8 200001495 NA              NA             NA               
##  9 200001496 NA              NA             NA               
## 10 200001497 NA              NA             NA               
## # ℹ 42,277 more rows
## # ℹ 37 more variables: CnBio_DateAdded <dttm>, CnBio_DateChanged <dttm>,
## #   CnBio_Key_Indicator <chr>, CnBio_Deceased <dbl>, CnBio_Title_1 <chr>,
## #   CnBio_Marital_status <chr>, City <chr>, State <fct>,
## #   Reunions_attended <dbl>, Zip <chr>, CnAdrPrf_Type <chr>,
## #   CnCnstncy_1_01_CodeLong <chr>, CnCnstncy_1_02_CodeLong <chr>,
## #   CnCnstncy_1_03_CodeLong <chr>, CnCnstncy_1_04_CodeLong <chr>, …
# Binary-encode CnBio_Key_Indicator: "O" -> 0, anything else -> 1.
# (Presumably "O" = organization and "I" = individual -- TODO confirm
# against the Raiser's Edge data dictionary.)
df2$CnBio_Key_Indicator <- ifelse(df2$CnBio_Key_Indicator == "O", 0, 1)
## Any Degrees Present in Education:
# Count degrees on record: 0 if both degree fields are missing or either is
# "None"/"Unknown"; 2 if both fields are populated; 1 otherwise.
# NOTE(review): a record with one real degree and one "None"/"Unknown" entry
# is scored 0, not 1 -- confirm this is intended.
df2$Any_Degree_Present <- ifelse((is.na(df2$CnRelEdu_1_01_Degree) & is.na(df2$CnRelEdu_1_02_Degree)) | 
                                 (df2$CnRelEdu_1_01_Degree %in% c("None", "Unknown")) | 
                                 (df2$CnRelEdu_1_02_Degree %in% c("None", "Unknown")), 0,
                                 ifelse(!is.na(df2$CnRelEdu_1_01_Degree) & !is.na(df2$CnRelEdu_1_02_Degree), 2,1))
# Extract year and month from the first-gift datetime (lubridate);
# rows with a missing First_Gift_Date yield NA in both derived columns.
df2$First_Gift_Year <- year(df2$First_Gift_Date)
df2$First_Gift_Month <- month(df2$First_Gift_Date)

#Data Cleaning:

# Keep only constituents with non-zero lifetime giving: the target variable
# Total_Lifetime_Giving is heavily zero-inflated (median 0), and the zero
# rows would distort the analysis.
nonzero_givers <- df2$Total_Lifetime_Giving != 0
df2 <- df2[nonzero_givers, ]

# Show the filtered data frame.
print(df2)
## # A tibble: 15,116 × 44
##     CnBio_ID First_Gift_Date     Last_Gift_Date      Largest_Gift_Date  
##        <dbl> <dttm>              <dttm>              <dttm>             
##  1 100001361 1973-02-01 00:00:00 2021-12-27 00:00:00 2021-12-27 00:00:00
##  2 100001363 1986-08-11 00:00:00 2014-06-03 00:00:00 1986-08-11 00:00:00
##  3 100001364 1971-12-26 00:00:00 2008-11-10 00:00:00 1987-11-19 00:00:00
##  4 100001366 1977-09-17 00:00:00 2018-04-22 00:00:00 2016-04-24 00:00:00
##  5 100001368 1985-06-10 00:00:00 1996-11-06 00:00:00 1985-06-10 00:00:00
##  6 100001373 1977-05-31 00:00:00 2020-03-16 00:00:00 2006-11-14 00:00:00
##  7 100001376 1977-05-24 00:00:00 2021-11-24 00:00:00 2021-11-24 00:00:00
##  8 100001378 1968-10-05 00:00:00 2020-06-08 00:00:00 1991-03-13 00:00:00
##  9    341286 2009-05-11 00:00:00 2011-11-11 00:00:00 2011-11-11 00:00:00
## 10    337744 2020-05-16 00:00:00 2020-05-16 00:00:00 2020-05-16 00:00:00
## # ℹ 15,106 more rows
## # ℹ 40 more variables: CnBio_DateAdded <dttm>, CnBio_DateChanged <dttm>,
## #   CnBio_Key_Indicator <dbl>, CnBio_Deceased <dbl>, CnBio_Title_1 <chr>,
## #   CnBio_Marital_status <chr>, City <chr>, State <fct>,
## #   Reunions_attended <dbl>, Zip <chr>, CnAdrPrf_Type <chr>,
## #   CnCnstncy_1_01_CodeLong <chr>, CnCnstncy_1_02_CodeLong <chr>,
## #   CnCnstncy_1_03_CodeLong <chr>, CnCnstncy_1_04_CodeLong <chr>, …
#Dropping the irrelevant columns for the donation purpose:
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Columns judged irrelevant to modelling donations (identifiers, free-text
# address fields, and columns superseded by the engineered features).
drop_cols <- c("CnSpSpBio_ID", "Personal_Email_End", "Largest_Gift_Date", "Zip",
               "CnSpPrBs_RecordImportID", "CnBio_Title_1", "City", "Last_Gift_Date",
               "Married_to_an_Alum", "CnRelEdu_1_01_Class_of")
df2 <- df2 %>% select(-all_of(drop_cols))
# Remove the single largest Total_Lifetime_Giving outlier row.
df2 <- df2[-which.max(df2$Total_Lifetime_Giving), ]
#Taking only numeric variables:
# vapply() (rather than sapply()) guarantees a logical mask even in
# degenerate cases, and drop = FALSE keeps the result a data frame.
numeric_mask <- vapply(df2, is.numeric, logical(1))
numeric_df <- df2[, numeric_mask, drop = FALSE]
# compute the correlation matrix between all pairs of numeric columns
print(cor(numeric_df))
##                                     CnBio_ID CnBio_Key_Indicator CnBio_Deceased
## CnBio_ID                         1.000000000         -0.02707257    0.173067828
## CnBio_Key_Indicator             -0.027072570          1.00000000    0.157417708
## CnBio_Deceased                   0.173067828          0.15741771    1.000000000
## Reunions_attended                0.011722215          0.03607486   -0.037181403
## Alumni_Board_Member             -0.018515463          0.01404689   -0.015135426
## Total_Lifetime_Giving            0.010025236         -0.06476337   -0.005745462
## Last_10_Years_Giving            -0.000172846         -0.08877176   -0.025284441
## Last_5_Years_Giving              0.021639108         -0.05990592   -0.023797930
## Alumni                          -0.075115148          0.24895521   -0.017145338
## CnCnstncy_1_01_CodeLong_Encoded -0.039611696         -0.27438305   -0.043194507
## CnCnstncy_1_02_CodeLong_Encoded -0.016985372          0.03112182   -0.018741522
## CnCnstncy_1_03_CodeLong_Encoded -0.027017278          0.02799232   -0.008792243
## CnBio_Marital_status_Encoded    -0.060076252          0.26824803    0.072669025
## State_encoded                   -0.134263327         -0.02439766   -0.094301389
## Any_Degree_Present              -0.105045861          0.23137718   -0.058129480
## First_Gift_Year                 -0.136690635         -0.01927302   -0.296002621
## First_Gift_Month                 0.036036393         -0.03994877   -0.052067518
##                                 Reunions_attended Alumni_Board_Member
## CnBio_ID                              0.011722215       -0.0185154632
## CnBio_Key_Indicator                   0.036074863        0.0140468939
## CnBio_Deceased                       -0.037181403       -0.0151354262
## Reunions_attended                     1.000000000        0.0748671877
## Alumni_Board_Member                   0.074867188        1.0000000000
## Total_Lifetime_Giving                 0.003519344        0.0008028766
## Last_10_Years_Giving                  0.014176013        0.0020996682
## Last_5_Years_Giving                   0.026596257        0.0059125106
## Alumni                                0.079893332        0.0564233770
## CnCnstncy_1_01_CodeLong_Encoded       0.076210996        0.0432673310
## CnCnstncy_1_02_CodeLong_Encoded       0.100902301        0.0179823034
## CnCnstncy_1_03_CodeLong_Encoded       0.159452322        0.0200510561
## CnBio_Marital_status_Encoded          0.100662864        0.0297046868
## State_encoded                         0.011511602        0.0157708281
## Any_Degree_Present                    0.074956572        0.0740118917
## First_Gift_Year                      -0.142611822       -0.0108378874
## First_Gift_Month                     -0.028161465        0.0054782915
##                                 Total_Lifetime_Giving Last_10_Years_Giving
## CnBio_ID                                 0.0100252364        -1.728460e-04
## CnBio_Key_Indicator                     -0.0647633747        -8.877176e-02
## CnBio_Deceased                          -0.0057454619        -2.528444e-02
## Reunions_attended                        0.0035193443         1.417601e-02
## Alumni_Board_Member                      0.0008028766         2.099668e-03
## Total_Lifetime_Giving                    1.0000000000         5.920496e-01
## Last_10_Years_Giving                     0.5920495964         1.000000e+00
## Last_5_Years_Giving                      0.4190867332         6.422503e-01
## Alumni                                  -0.0230186643        -3.200903e-02
## CnCnstncy_1_01_CodeLong_Encoded          0.0154420560         1.550557e-02
## CnCnstncy_1_02_CodeLong_Encoded          0.0297843131         2.197695e-02
## CnCnstncy_1_03_CodeLong_Encoded          0.0713673700         1.404653e-03
## CnBio_Marital_status_Encoded             0.0106121228         3.156821e-05
## State_encoded                            0.0040689666        -3.191102e-03
## Any_Degree_Present                      -0.0155931585        -2.995966e-02
## First_Gift_Year                         -0.0276762147         9.896727e-03
## First_Gift_Month                         0.0113455100         2.590825e-02
##                                 Last_5_Years_Giving      Alumni
## CnBio_ID                                0.021639108 -0.07511515
## CnBio_Key_Indicator                    -0.059905923  0.24895521
## CnBio_Deceased                         -0.023797930 -0.01714534
## Reunions_attended                       0.026596257  0.07989333
## Alumni_Board_Member                     0.005912511  0.05642338
## Total_Lifetime_Giving                   0.419086733 -0.02301866
## Last_10_Years_Giving                    0.642250279 -0.03200903
## Last_5_Years_Giving                     1.000000000 -0.02264843
## Alumni                                 -0.022648431  1.00000000
## CnCnstncy_1_01_CodeLong_Encoded         0.008444149  0.76569753
## CnCnstncy_1_02_CodeLong_Encoded         0.026330669  0.32290206
## CnCnstncy_1_03_CodeLong_Encoded        -0.002389547  0.08729421
## CnBio_Marital_status_Encoded            0.003896422  0.39861807
## State_encoded                          -0.004823998 -0.04192007
## Any_Degree_Present                     -0.021808978  0.89129643
## First_Gift_Year                         0.006392624 -0.14787781
## First_Gift_Month                        0.021706413 -0.02750813
##                                 CnCnstncy_1_01_CodeLong_Encoded
## CnBio_ID                                           -0.039611696
## CnBio_Key_Indicator                                -0.274383052
## CnBio_Deceased                                     -0.043194507
## Reunions_attended                                   0.076210996
## Alumni_Board_Member                                 0.043267331
## Total_Lifetime_Giving                               0.015442056
## Last_10_Years_Giving                                0.015505574
## Last_5_Years_Giving                                 0.008444149
## Alumni                                              0.765697533
## CnCnstncy_1_01_CodeLong_Encoded                     1.000000000
## CnCnstncy_1_02_CodeLong_Encoded                     0.282285411
## CnCnstncy_1_03_CodeLong_Encoded                     0.069207481
## CnBio_Marital_status_Encoded                        0.234442371
## State_encoded                                      -0.017327307
## Any_Degree_Present                                  0.678909063
## First_Gift_Year                                    -0.224824614
## First_Gift_Month                                   -0.015940561
##                                 CnCnstncy_1_02_CodeLong_Encoded
## CnBio_ID                                            -0.01698537
## CnBio_Key_Indicator                                  0.03112182
## CnBio_Deceased                                      -0.01874152
## Reunions_attended                                    0.10090230
## Alumni_Board_Member                                  0.01798230
## Total_Lifetime_Giving                                0.02978431
## Last_10_Years_Giving                                 0.02197695
## Last_5_Years_Giving                                  0.02633067
## Alumni                                               0.32290206
## CnCnstncy_1_01_CodeLong_Encoded                      0.28228541
## CnCnstncy_1_02_CodeLong_Encoded                      1.00000000
## CnCnstncy_1_03_CodeLong_Encoded                      0.22385491
## CnBio_Marital_status_Encoded                         0.18261191
## State_encoded                                        0.02382638
## Any_Degree_Present                                   0.30161287
## First_Gift_Year                                     -0.10473342
## First_Gift_Month                                    -0.01918667
##                                 CnCnstncy_1_03_CodeLong_Encoded
## CnBio_ID                                           -0.027017278
## CnBio_Key_Indicator                                 0.027992322
## CnBio_Deceased                                     -0.008792243
## Reunions_attended                                   0.159452322
## Alumni_Board_Member                                 0.020051056
## Total_Lifetime_Giving                               0.071367370
## Last_10_Years_Giving                                0.001404653
## Last_5_Years_Giving                                -0.002389547
## Alumni                                              0.087294213
## CnCnstncy_1_01_CodeLong_Encoded                     0.069207481
## CnCnstncy_1_02_CodeLong_Encoded                     0.223854913
## CnCnstncy_1_03_CodeLong_Encoded                     1.000000000
## CnBio_Marital_status_Encoded                        0.072242463
## State_encoded                                       0.024987907
## Any_Degree_Present                                  0.099180314
## First_Gift_Year                                    -0.058744451
## First_Gift_Month                                   -0.011761106
##                                 CnBio_Marital_status_Encoded State_encoded
## CnBio_ID                                       -6.007625e-02  -0.134263327
## CnBio_Key_Indicator                             2.682480e-01  -0.024397660
## CnBio_Deceased                                  7.266903e-02  -0.094301389
## Reunions_attended                               1.006629e-01   0.011511602
## Alumni_Board_Member                             2.970469e-02   0.015770828
## Total_Lifetime_Giving                           1.061212e-02   0.004068967
## Last_10_Years_Giving                            3.156821e-05  -0.003191102
## Last_5_Years_Giving                             3.896422e-03  -0.004823998
## Alumni                                          3.986181e-01  -0.041920074
## CnCnstncy_1_01_CodeLong_Encoded                 2.344424e-01  -0.017327307
## CnCnstncy_1_02_CodeLong_Encoded                 1.826119e-01   0.023826376
## CnCnstncy_1_03_CodeLong_Encoded                 7.224246e-02   0.024987907
## CnBio_Marital_status_Encoded                    1.000000e+00   0.017226068
## State_encoded                                   1.722607e-02   1.000000000
## Any_Degree_Present                              3.614488e-01  -0.012859653
## First_Gift_Year                                -1.903332e-01   0.079489176
## First_Gift_Month                               -3.643123e-02   0.028969371
##                                 Any_Degree_Present First_Gift_Year
## CnBio_ID                               -0.10504586    -0.136690635
## CnBio_Key_Indicator                     0.23137718    -0.019273023
## CnBio_Deceased                         -0.05812948    -0.296002621
## Reunions_attended                       0.07495657    -0.142611822
## Alumni_Board_Member                     0.07401189    -0.010837887
## Total_Lifetime_Giving                  -0.01559316    -0.027676215
## Last_10_Years_Giving                   -0.02995966     0.009896727
## Last_5_Years_Giving                    -0.02180898     0.006392624
## Alumni                                  0.89129643    -0.147877814
## CnCnstncy_1_01_CodeLong_Encoded         0.67890906    -0.224824614
## CnCnstncy_1_02_CodeLong_Encoded         0.30161287    -0.104733420
## CnCnstncy_1_03_CodeLong_Encoded         0.09918031    -0.058744451
## CnBio_Marital_status_Encoded            0.36144880    -0.190333220
## State_encoded                          -0.01285965     0.079489176
## Any_Degree_Present                      1.00000000    -0.118145454
## First_Gift_Year                        -0.11814545     1.000000000
## First_Gift_Month                       -0.02140397     0.180048917
##                                 First_Gift_Month
## CnBio_ID                             0.036036393
## CnBio_Key_Indicator                 -0.039948769
## CnBio_Deceased                      -0.052067518
## Reunions_attended                   -0.028161465
## Alumni_Board_Member                  0.005478291
## Total_Lifetime_Giving                0.011345510
## Last_10_Years_Giving                 0.025908253
## Last_5_Years_Giving                  0.021706413
## Alumni                              -0.027508133
## CnCnstncy_1_01_CodeLong_Encoded     -0.015940561
## CnCnstncy_1_02_CodeLong_Encoded     -0.019186667
## CnCnstncy_1_03_CodeLong_Encoded     -0.011761106
## CnBio_Marital_status_Encoded        -0.036431225
## State_encoded                        0.028969371
## Any_Degree_Present                  -0.021403968
## First_Gift_Year                      0.180048917
## First_Gift_Month                     1.000000000
# Pairwise Pearson correlations across all numeric columns
cor_matrix <- cor(numeric_df)

# Render the correlation matrix as a base-graphics heatmap;
# dendrograms are suppressed so rows/columns keep their original order
heatmap(cor_matrix,
        Rowv = NA,
        Colv = NA,               # no row/column clustering
        symm = TRUE,             # matrix is symmetric
        margins = c(10, 10))     # room for the long column labels

#Exploratory Data Analysis:

##Graph for :CnCnstncy_1_01_CodeLong_Encoded:

# Tally how many records fall into each CnCnstncy_1_01_CodeLong category
grouped_df <- df2 %>%
  count(CnCnstncy_1_01_CodeLong, name = "total_count")

# Pie chart: a stacked unit bar wrapped onto polar coordinates
ggplot(grouped_df,
       aes(x = "", y = total_count, fill = CnCnstncy_1_01_CodeLong)) +
  geom_bar(stat = "identity", width = 1) +
  coord_polar(theta = "y") +
  labs(x = NULL, y = NULL,
       fill = "CnCnstncy_1_01_CodeLong",
       title = "Distribution of CnCnstncy_1_01_CodeLong") +
  theme_void()

##Graph for :CnCnstncy_1_02_CodeLong_Encoded:

# Tally how many records fall into each CnCnstncy_1_02_CodeLong category
grouped_df <- df2 %>%
  count(CnCnstncy_1_02_CodeLong, name = "total_count")

# Same pie-chart construction for the second constituency code
ggplot(grouped_df,
       aes(x = "", y = total_count, fill = CnCnstncy_1_02_CodeLong)) +
  geom_bar(stat = "identity", width = 1) +
  coord_polar(theta = "y") +
  labs(x = NULL, y = NULL,
       fill = "CnCnstncy_1_02_CodeLong",
       title = "Distribution of CnCnstncy_1_02_CodeLong") +
  theme_void()

# Scatter plot of first-gift year against total lifetime giving.
# Fix: columns are referenced by bare name inside aes() instead of
# df2$First_Gift_Year / df2$Total_Lifetime_Giving — ggplot2 warns that
# the $-form bypasses data masking and misbehaves under facetting.
ggplot(df2, aes(x = First_Gift_Year, y = Total_Lifetime_Giving)) +
  geom_point() +
  scale_y_continuous(limits = c(0, 110000)) +   # trims extreme gifts from view
  labs(x = "First_Gift_Year Axis Label",
       y = "Total_Lifetime_Giving",
       title = "Scatter Plot Title")
## Warning: Removed 131 rows containing missing values (`geom_point()`).

# Count the frequency of each Alumni value (the table names are the codes)
freq <- table(df2$Alumni)

# Build a frame keeping category, count, and percentage share together.
# Fix: the original dropped the category (names(freq)) and used the raw
# counts both as the value and as the fill factor, so the legend labels
# were tied to count ordering rather than to the actual Alumni codes.
df_freq <- data.frame(category = names(freq),
                      value = as.numeric(freq),
                      percentage = round(as.numeric(freq) / sum(freq) * 100, 1))

# Pie chart with in-slice percentage labels, coloured by Alumni category.
# Fix: legend label corrected to "Alumni Member" — this chart summarises
# the Alumni column, not Alumni_Board_Member.
ggplot(data = df_freq, aes(x = "", y = value, fill = factor(category))) +
  geom_bar(width = 1, stat = "identity") +
  coord_polar(theta = "y") +
  geom_text(aes(label = paste0(percentage, "%")),
            position = position_stack(vjust = 0.5)) +
  labs(title = "Alumni Member Distribution") +
  scale_fill_manual(values = c("Pink", "Yellow"),
                    labels = c("Non-Alumni Member", "Alumni Member")) +
  theme_void()

# Sum total lifetime giving within each first-gift year
total_giving_by_year <- df2 %>%
  group_by(First_Gift_Year) %>%
  summarize(Total_Lifetime_Giving = sum(Total_Lifetime_Giving))

# Line chart tracing giving totals across first-gift years
ggplot(total_giving_by_year,
       aes(x = First_Gift_Year, y = Total_Lifetime_Giving)) +
  geom_line(color = "purple") +
  labs(x = "First Gift Year",
       y = "Total Lifetime Giving",
       title = "First Gift Year vs Total Lifetime Giving")

## Drop the sparse early records: keep only First_Gift_Year after 1940
library(dplyr)

numeric_df <- filter(numeric_df, First_Gift_Year > 1940)

# Sum total lifetime giving within each first-gift month
total_giving_by_month <- numeric_df %>%
  group_by(First_Gift_Month) %>%
  summarize(Total_Lifetime_Giving = sum(Total_Lifetime_Giving))

# Line chart of giving totals across first-gift months
ggplot(total_giving_by_month,
       aes(x = First_Gift_Month, y = Total_Lifetime_Giving)) +
  geom_line() +
  labs(x = "First Gift Month",
       y = "Total Lifetime Giving",
       title = "First Gift Month vs Total Lifetime Giving")

library(ggplot2)
##1
# Sum lifetime giving per (first-gift year, alumni flag) combination
grouped_df <- df2 %>%
  group_by(First_Gift_Year, Alumni) %>%
  summarize(total_lifetime_giving = sum(Total_Lifetime_Giving))
## `summarise()` has grouped output by 'First_Gift_Year'. You can override using
## the `.groups` argument.

# Bubble chart: size mirrors the giving total, colour distinguishes Alumni
ggplot(grouped_df,
       aes(x = First_Gift_Year,
           y = total_lifetime_giving,
           size = total_lifetime_giving,
           color = Alumni)) +
  geom_point() +
  scale_size(range = c(1, 10)) +
  labs(x = "Year",
       y = "Total Lifetime Giving",
       title = "Bubble chart of Total Lifetime Giving by Year and Alumni")

##2

# Sum lifetime giving per (first-gift year, degree-present flag) combination
grouped_df <- df2 %>%
  group_by(First_Gift_Year, Any_Degree_Present) %>%
  summarize(total_lifetime_giving = sum(Total_Lifetime_Giving))
## `summarise()` has grouped output by 'First_Gift_Year'. You can override using
## the `.groups` argument.

# Bubble chart: size mirrors the giving total, colour marks degree presence
ggplot(grouped_df,
       aes(x = First_Gift_Year,
           y = total_lifetime_giving,
           size = total_lifetime_giving,
           color = Any_Degree_Present)) +
  geom_point() +
  scale_size(range = c(1, 10)) +
  labs(x = "Year",
       y = "Total Lifetime Giving",
       title = "Bubble chart of Total Lifetime Giving by Year and Alumni Board Member")

##1

# Helper: scatter of a predictor column against Total_Lifetime_Giving with
# a fitted least-squares line. `xcol` is the column name as a string; it is
# also used for the x-axis label, matching the originals exactly.
lm_scatter <- function(data, xcol) {
  ggplot(data, aes(.data[[xcol]], Total_Lifetime_Giving)) +
    geom_point() +
    geom_smooth(method = "lm", se = FALSE) +
    labs(x = xcol, y = "Total_Lifetime_Giving", title = "Linear Graph Title")
}

lm_scatter(df2, "CnCnstncy_1_01_CodeLong_Encoded")
## `geom_smooth()` using formula = 'y ~ x'

##2
lm_scatter(df2, "First_Gift_Year")
## `geom_smooth()` using formula = 'y ~ x'

##3
lm_scatter(df2, "Last_10_Years_Giving")
## `geom_smooth()` using formula = 'y ~ x'

library(dplyr) # data-manipulation verbs

# States attached to the 40 largest Total_Lifetime_Giving records
# (the vector may repeat states or contain NA; %in% copes with both)
topstates <- df2 %>%
  arrange(desc(Total_Lifetime_Giving)) %>%
  slice(1:40) %>%
  pull(State)

# Scatter of giving by state, restricted to those states
df2 %>%
  filter(State %in% topstates) %>%
  ggplot(aes(x = State, y = Total_Lifetime_Giving)) +
  geom_point() +
  labs(x = "State", y = "Total Lifetime Giving") +
  ylim(0, 1200000) # y-axis capped to accommodate the highest value
## Warning: Removed 7 rows containing missing values (`geom_point()`).

library(ggplot2)
library(dplyr)

# The six chunks below were copy-pasted variations of the same pattern
# (top/bottom 10 states for three giving columns); they are factored into
# two helpers to remove the duplication.

# Sum `col` per state, drop the "N/A" placeholder, and keep the `n` states
# with the highest (top = TRUE) or lowest (top = FALSE) totals. The summed
# column keeps the original metric name so downstream code sees the same
# column as before.
rank_states_by <- function(data, col, top = TRUE, n = 10) {
  ranked <- data %>%
    group_by(State) %>%
    summarise("{col}" := sum(.data[[col]])) %>%
    filter(State != "N/A")
  if (top) {
    ranked <- arrange(ranked, desc(.data[[col]]))
  } else {
    ranked <- arrange(ranked, .data[[col]])
  }
  head(ranked, n)
}

# Bar chart of a ranked state table produced by rank_states_by().
# `label` feeds both the y-axis label and the title suffix, reproducing the
# original titles such as "Top 10 Highest States by Last_5_Years_Giving".
plot_state_rank <- function(ranked, col, label = col,
                            fill = "blue", direction = "Highest") {
  ggplot(ranked, aes(x = reorder(State, .data[[col]]), y = .data[[col]])) +
    geom_bar(stat = "identity", fill = fill) +
    ggtitle(paste("Top 10", direction, "States by", label)) +
    xlab("State") +
    ylab(label) +
    theme(axis.text.x = element_text(angle = 90, hjust = 1))
}

##Top with Total
df_top10 <- rank_states_by(df2, "Total_Lifetime_Giving", top = TRUE)
plot_state_rank(df_top10, "Total_Lifetime_Giving",
                label = "Total Lifetime Giving")

##bottom with total
df_bottom10 <- rank_states_by(df2, "Total_Lifetime_Giving", top = FALSE)
plot_state_rank(df_bottom10, "Total_Lifetime_Giving",
                label = "Total Lifetime Giving",
                fill = "pink", direction = "Lowest")

#Top with 5 years
df_top10 <- rank_states_by(df2, "Last_5_Years_Giving", top = TRUE)
plot_state_rank(df_top10, "Last_5_Years_Giving")

#Bottom with 5 years
df_bottom10 <- rank_states_by(df2, "Last_5_Years_Giving", top = FALSE)
plot_state_rank(df_bottom10, "Last_5_Years_Giving",
                fill = "pink", direction = "Lowest")

#Top with 10 years
df_top10 <- rank_states_by(df2, "Last_10_Years_Giving", top = TRUE)
plot_state_rank(df_top10, "Last_10_Years_Giving")

#Bottom 10 with Last 10 years
df_bottom10 <- rank_states_by(df2, "Last_10_Years_Giving", top = FALSE)
plot_state_rank(df_bottom10, "Last_10_Years_Giving",
                fill = "pink", direction = "Lowest")

#1

# Histogram of total lifetime giving: coarse $100k bins over $10k–$1M
ggplot(numeric_df, aes(x = Total_Lifetime_Giving)) +
  geom_histogram(binwidth = 100000, fill = "orange") +
  xlim(10000, 1000000)
## Warning: Removed 14326 rows containing non-finite values (`stat_bin()`).
## Warning: Removed 2 rows containing missing values (`geom_bar()`).

#2
# Finer $10k bins zoomed into the $10k–$260k range
ggplot(numeric_df, aes(x = Total_Lifetime_Giving)) +
  geom_histogram(binwidth = 10000, fill = "darkgreen") +
  xlim(10000, 260000)
## Warning: Removed 14375 rows containing non-finite values (`stat_bin()`).
## Removed 2 rows containing missing values (`geom_bar()`).

#3
# Tail view: $100k bins for the $100k–$7M range
ggplot(numeric_df, aes(x = Total_Lifetime_Giving)) +
  geom_histogram(binwidth = 100000, fill = "darkblue") +
  xlim(100000, 7000000)
## Warning: Removed 14790 rows containing non-finite values (`stat_bin()`).
## Removed 2 rows containing missing values (`geom_bar()`).

# Work on a copy so the transformation does not alter numeric_df itself
df=numeric_df

# Dimensions after the First_Gift_Year > 1940 filter (rows x columns)
dim(df)
## [1] 14927    17
# Log-transform total giving to tame its heavy right skew.
# NOTE(review): log(0) yields -Inf for constituents with zero lifetime
# giving — confirm whether log1p() was intended. The column is not used
# further in this visible portion of the script.
df$log_Total_Lifetime_Giving <- log(df$Total_Lifetime_Giving)
# count missing values
colSums(is.na(numeric_df))
##                        CnBio_ID             CnBio_Key_Indicator 
##                               0                               0 
##                  CnBio_Deceased               Reunions_attended 
##                               0                               0 
##             Alumni_Board_Member           Total_Lifetime_Giving 
##                               0                               0 
##            Last_10_Years_Giving             Last_5_Years_Giving 
##                               0                               0 
##                          Alumni CnCnstncy_1_01_CodeLong_Encoded 
##                               0                               0 
## CnCnstncy_1_02_CodeLong_Encoded CnCnstncy_1_03_CodeLong_Encoded 
##                               0                               0 
##    CnBio_Marital_status_Encoded                   State_encoded 
##                               0                               0 
##              Any_Degree_Present                 First_Gift_Year 
##                               0                               0 
##                First_Gift_Month 
##                               0
library(dplyr)
#numeric_df <- numeric_df %>% select(-CnRelEdu_1_01_Class_of)

#numeric_df <- numeric_df %>% select(-CnSpSpBio_ID)
# Fit a linear regression of total giving on every other numeric column.
# NOTE(review): the predictors include CnBio_ID and the Last_5/10_Years_Giving
# columns, which overlap the response — confirm this is intentional.
model <- lm(Total_Lifetime_Giving ~ ., data = numeric_df)

# Fitted values and residuals from the model
fitted <- fitted(model)
residuals <- residuals(model)

# Assemble a residuals-vs-fitted frame for plotting
plot_df <- data.frame(Fitted_Values = fitted, Residuals = residuals)

# Residual plot: the standard first diagnostic for an lm fit
ggplot(plot_df, aes(Fitted_Values, Residuals)) +
  geom_point() +
  geom_smooth(se = FALSE) +
  labs(x = "Fitted Values", y = "Residuals", title = "Residual Plot")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

## A "residuals versus fits plot" — residuals on the y axis against fitted
## values (estimated responses) on the x axis — is the most frequently created
## residual-analysis plot; it is used to detect non-linearity.

#Model Evaluation:

#Finding the optimal features for analysis:

## Lasso find the optimal variables:
# Load the glmnet package
library(glmnet)

# Predictor matrix (all columns except the response) and response vector
x <- as.matrix(numeric_df[, -which(names(numeric_df) == "Total_Lifetime_Giving")])
y <- numeric_df$Total_Lifetime_Giving

# Fit the LASSO path across a grid of lambda values
lasso <- glmnet(x, y, alpha = 1)

# Plot the LASSO regularization path.
# Fix: removed the width/height arguments — they are not graphical
# parameters for base plot() and every call emitted a stream of
# '"width" is not a graphical parameter' warnings.
plot(lasso, xvar = "lambda", label = TRUE)

# Choose the optimal lambda value using cross-validation
cv.lasso <- cv.glmnet(x, y, alpha = 1)
lambda <- cv.lasso$lambda.min

# Extract the coefficients for the optimal lambda value
lasso.coef <- coef(lasso, s = lambda)
lasso.coef <- lasso.coef[-1, ] # Exclude the intercept term

# Features with non-zero coefficients at the chosen lambda are the ones
# the LASSO kept
lasso.features <- names(numeric_df)[-which(names(numeric_df) == "Total_Lifetime_Giving")]
lasso.features <- lasso.features[which(lasso.coef != 0)]

# Re-draw the regularization path for reference next to the selection
plot(lasso, xvar = "lambda", label = TRUE)

#Analyzing with Linear Regression with Train-Test and Validating the data:

##On Training test and validation for linear regression:
library(caret)
# Split the data into training/validation (75%) and testing (25%) sets
set.seed(123)
trainIndex <- createDataPartition(numeric_df$Total_Lifetime_Giving, p = 0.75, list = FALSE)
train_val <- numeric_df[trainIndex, ]
test <- numeric_df[-trainIndex, ]

# Split the training/validation set into training (70%) and validation (30%)
trainIndex2 <- createDataPartition(train_val$Total_Lifetime_Giving, p = 0.7, list = FALSE)
train <- train_val[trainIndex2, ]
validation <- train_val[-trainIndex2, ]

# Fit the model using the training set (features chosen by the LASSO step)
model <- lm(Total_Lifetime_Giving ~ CnCnstncy_1_02_CodeLong_Encoded + CnBio_Key_Indicator +
              CnCnstncy_1_03_CodeLong_Encoded + Last_10_Years_Giving + Reunions_attended +
              Last_5_Years_Giving + CnCnstncy_1_01_CodeLong_Encoded,
            data = train)

# Predict on the held-out validation and testing sets
predictions_val <- predict(model, newdata = validation)
predictions_test <- predict(model, newdata = test)

# R-squared on the validation set.
# Fix: the original reported summary(model)$r.squared — the TRAINING
# R-squared — while labelling it a validation metric, and never used the
# validation split at all. This computes out-of-sample R-squared properly.
rsq_val <- 1 - sum((validation$Total_Lifetime_Giving - predictions_val)^2) /
  sum((validation$Total_Lifetime_Giving - mean(validation$Total_Lifetime_Giving))^2)
print(rsq_val)

# R-squared on the testing set
rsq_test <- 1 - sum((test$Total_Lifetime_Giving - predictions_test)^2) /
  sum((test$Total_Lifetime_Giving - mean(test$Total_Lifetime_Giving))^2)
print(rsq_test)
## [1] 0.1075281

#Analyzing with Ridge Regression with Train-Test and Validating the data:

library(glmnet)

# Split the data into training (70%), validation (20%), and testing (~10%) sets
set.seed(123)
train_index <- sample(1:nrow(numeric_df), 0.7 * nrow(numeric_df))
val_index <- sample(setdiff(1:nrow(numeric_df), train_index), 0.2 * nrow(numeric_df))
test_index <- setdiff(setdiff(1:nrow(numeric_df), train_index), val_index)
train <- numeric_df[train_index, ]
val <- numeric_df[val_index, ]
test <- numeric_df[test_index, ]

# Response vectors for each split
y_train <- train$Total_Lifetime_Giving
y_val <- val$Total_Lifetime_Giving
y_test <- test$Total_Lifetime_Giving

# Shared predictor set, named once to avoid the repeated literal vectors.
# Fix: the original assigned x_val twice (an exact duplicate line).
ridge_predictors <- c('CnCnstncy_1_02_CodeLong_Encoded', 'CnCnstncy_1_03_CodeLong_Encoded',
                      'Reunions_attended', 'Last_5_Years_Giving',
                      'CnCnstncy_1_01_CodeLong_Encoded', 'CnBio_Key_Indicator')
x_train <- data.matrix(train[, ridge_predictors])
x_val <- data.matrix(val[, ridge_predictors])
x_test <- data.matrix(test[, ridge_predictors])

# Fit the Ridge regression model using cross-validation on the training set
cv_model <- cv.glmnet(x_train, y_train, alpha = 0, lambda = seq(0.001, 1, length = 100))

# Best lambda value chosen by cross-validation
best_lambda <- cv_model$lambda.min

# Refit the Ridge model at the best lambda on the training set
best_model <- glmnet(x_train, y_train, alpha = 0, lambda = best_lambda)

# Use the fitted model to make predictions on the test set
y_predicted <- predict(best_model, s = best_lambda, newx = x_test)

# Find SST and SSE on the test set.
# Fix: the baseline now uses mean(y_test), consistent with the R-squared
# computations in every other model section (the original used mean(y_train)).
sst <- sum((y_test - mean(y_test))^2)
sse <- sum((y_predicted - y_test)^2)

# Find R-squared
rsq <- 1 - sse/sst
rsq

#Analyzing with Lasso Regression with Train-Test and Validating the data:

##lasso with validation:

# Load necessary libraries
library(caret)
library(glmnet)

# Split into training (70%), validation (15%), and test (15%) sets
set.seed(123)
train_index <- createDataPartition(numeric_df$Total_Lifetime_Giving, p = 0.7, list = FALSE)
train_data <- numeric_df[train_index, ]
valid_test_data <- numeric_df[-train_index, ]
valid_index <- createDataPartition(valid_test_data$Total_Lifetime_Giving, p = 0.5, list = FALSE)
valid_data <- valid_test_data[valid_index, ]
test_data <- valid_test_data[-valid_index, ]

# Response vectors for each split
y_train <- train_data$Total_Lifetime_Giving
y_valid <- valid_data$Total_Lifetime_Giving
y_test <- test_data$Total_Lifetime_Giving

# Shared predictor set, named once to avoid the repeated literal vectors
lasso_predictors <- c('CnCnstncy_1_02_CodeLong_Encoded', 'CnCnstncy_1_03_CodeLong_Encoded',
                      'Last_10_Years_Giving', 'Reunions_attended', 'Last_5_Years_Giving',
                      'CnCnstncy_1_01_CodeLong_Encoded', 'CnBio_Key_Indicator')
x_train <- data.matrix(train_data[, lasso_predictors])
x_valid <- data.matrix(valid_data[, lasso_predictors])
x_test <- data.matrix(test_data[, lasso_predictors])

# Fit Lasso regression model using cross-validation on the training set
cv_model <- cv.glmnet(x_train, y_train, alpha = 1, lambda = seq(0.001, 1, length = 100))

# Extract the best lambda value
best_lambda <- cv_model$lambda.min

# Refit the Lasso model at the best lambda on the training set
best_model <- glmnet(x_train, y_train, alpha = 1, lambda = best_lambda)

# Validation predictions and R-squared.
# Fix: the original computed y_valid_predicted but never evaluated it;
# the validation R-squared is now reported alongside the test metric.
y_valid_predicted <- predict(best_model, s = best_lambda, newx = x_valid)
rsq_valid <- 1 - sum((y_valid_predicted - y_valid)^2) /
  sum((y_valid - mean(y_valid))^2)
rsq_valid

# Make predictions on test set using the fitted model
y_test_predicted <- predict(best_model, s = best_lambda, newx = x_test)

# Find SST and SSE for test set
sst_test <- sum((y_test - mean(y_test))^2)
sse_test <- sum((y_test_predicted - y_test)^2)

# Find R-Squared for test set
rsq_test <- 1 - sse_test/sst_test
rsq_test
## [1] 0.05918001

#Analyzing with Decision tree with Train-Test and Validating the data:

##validation for decision_tree
# NOTE(review): pROC is attached here but no ROC/AUC function is used in this
# regression section — confirm whether the dependency is needed.
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(rpart)

# Define response variable
y <- numeric_df$Total_Lifetime_Giving

# Define matrix of predictor variables
x <- data.matrix(numeric_df[, c('CnCnstncy_1_02_CodeLong_Encoded', 'CnCnstncy_1_03_CodeLong_Encoded','Reunions_attended','Last_5_Years_Giving','CnCnstncy_1_01_CodeLong_Encoded','CnBio_Key_Indicator')])

# Split the data into training, validation, and testing sets.
# NOTE(review): 0.75 + 0.2 leaves only ~5% of rows for the test set —
# confirm the small test split is intentional.
set.seed(123)
n <- nrow(numeric_df)
train_index <- sample(n, 0.75 * n)
valid_index <- sample(setdiff(1:n, train_index), 0.2 * n)
test_index <- setdiff(setdiff(1:n, train_index), valid_index)
x_train <- x[train_index, ]
y_train <- y[train_index]
x_valid <- x[valid_index, ]
y_valid <- y[valid_index]
x_test <- x[test_index, ]
y_test <- y[test_index]

# Fit decision tree regression model on training set ("anova" = regression tree)
model <- rpart(y_train ~ ., data = data.frame(x_train, y_train), method = "anova")

# Use fitted model to make predictions on validation set
y_predicted_valid <- predict(model, newdata = data.frame(x_valid))

# Calculate RMSE and R-squared on validation set
# (computed but not printed; only the test metric is reported below)
rmse_valid <- sqrt(mean((y_predicted_valid - y_valid)^2))
sst_valid <- sum((y_valid - mean(y_valid))^2)
sse_valid <- sum((y_predicted_valid - y_valid)^2)
rsq_valid <- 1 - sse_valid/sst_valid

# Use fitted model to make predictions on test set
y_predicted_test <- predict(model, newdata = data.frame(x_test), type = "vector")

# Calculate R-squared on test set
sst_test <- sum((y_test - mean(y_test))^2)
sse_test <- sum((y_predicted_test - y_test)^2)
rsq_test <- 1 - sse_test/sst_test

# Print the R-squared values for test sets

cat("Test set:\n")
## Test set:
cat(paste0("R-squared: ", rsq_test, "\n"))
## R-squared: 0.358177370788151

#Analyzing with Random forest Regression with Train-Test and Validating the data:

library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
# Split the data into training, validation, and testing sets.
# NOTE(review): x and y are reused from the decision-tree section above
# (they are not redefined here), and 0.75 + 0.2 again leaves only ~5% of
# rows for the test set.
set.seed(123)
train_index <- sample(nrow(numeric_df), 0.75 * nrow(numeric_df))
val_index <- sample(setdiff(1:nrow(numeric_df), train_index), 0.2 * nrow(numeric_df))
test_index <- setdiff(setdiff(1:nrow(numeric_df), train_index), val_index)
x_train <- x[train_index, ]
y_train <- y[train_index]
x_val <- x[val_index, ]
y_val <- y[val_index]
x_test <- x[test_index, ]
y_test <- y[test_index]


# Fit random forest regression model on training set (default hyperparameters)
model <- randomForest(y_train ~ ., data = data.frame(x_train, y_train))

# Use fitted model to make predictions on validation set
y_predicted_val <- predict(model, newdata = data.frame(x_val))

# Calculate R-squared on validation set
# (computed but not printed; no tuning is performed with it)
sst_val <- sum((y_val - mean(y_val))^2)
sse_val <- sum((y_predicted_val - y_val)^2)
rsq_val <- 1 - sse_val/sst_val


# Use fitted model to make predictions on test set
y_predicted_test <- predict(model, newdata = data.frame(x_test))

# Calculate R-squared on test set
sst_test <- sum((y_test - mean(y_test))^2)
sse_test <- sum((y_predicted_test - y_test)^2)
rsq_test <- 1 - sse_test/sst_test

# Print the R-squared values for test set
print(paste0("Test Set R-squared: ", rsq_test))
## [1] "Test Set R-squared: 0.369206316866739"

#Analyzing with SVM with Train-Test and Validating the data:

library(e1071)

# Define response variable
y <- numeric_df$Total_Lifetime_Giving

# Define matrix of predictor variables.
# Fix: the original built x from df2 — the full, unfiltered frame (~42k
# rows) — while y and the split indices below come from numeric_df, which
# was filtered to First_Gift_Year > 1940 (14,927 rows). The predictor rows
# and response values were therefore misaligned. Both now use numeric_df.
x <- data.matrix(numeric_df[, c('CnCnstncy_1_02_CodeLong_Encoded', 'CnCnstncy_1_03_CodeLong_Encoded','Last_10_Years_Giving','Reunions_attended','Last_5_Years_Giving','CnCnstncy_1_01_CodeLong_Encoded','CnBio_Key_Indicator')])

# Split the data into training (75%), validation (20%), testing (~5%) sets
set.seed(123)
train_index <- sample(nrow(numeric_df), 0.75 * nrow(numeric_df))
val_index <- sample(setdiff(1:nrow(numeric_df), train_index), 0.2 * nrow(numeric_df))
test_index <- setdiff(setdiff(1:nrow(numeric_df), train_index), val_index)
x_train <- x[train_index, ]
y_train <- y[train_index]
x_val <- x[val_index, ]
y_val <- y[val_index]
x_test <- x[test_index, ]
y_test <- y[test_index]

# Fit support vector regression model on training set with default hyperparameters
model <- svm(y_train ~ ., data = data.frame(x_train, y_train), kernel = "radial")

# Tune gamma and cost by 2-fold cross-validation on the training set.
# (The stale tuning output recorded before the x/y alignment fix has been
# removed; re-knit to regenerate it.)
tuned_model <- tune.svm(y_train ~ ., data = data.frame(x_train, y_train), kernel = "radial", gamma = 10^(-6:1), cost = 10^(-1:2), tunecontrol = tune.control(cross = 2))
summary(tuned_model)
best_gamma <- tuned_model$best.parameters$gamma
best_cost <- tuned_model$best.parameters$cost

# Refit support vector regression on the training set with tuned hyperparameters
final_model <- svm(y_train ~ ., data = data.frame(x_train, y_train), kernel = "radial", gamma = best_gamma, cost = best_cost)

# Use fitted model to make predictions on test set
y_predicted <- predict(final_model, newdata = data.frame(x_test))

# Calculate R-squared on test set
sst <- sum((y_test - mean(y_test))^2)
sse <- sum((y_predicted - y_test)^2)
rsq <- 1 - sse/sst

# Print the R-squared values
print(paste0("R-squared: ", rsq))
# Keep records whose total lifetime giving falls between $200,000 and
# $5,000,000. (The original comment incorrectly described the range as
# $1,000–$200,000 and referred to a 'price' column.)
df_filtered <- subset(numeric_df, Total_Lifetime_Giving >= 200000 & Total_Lifetime_Giving <= 5000000)

#Load the ggplot2 library
library(ggplot2)

# Box plot of Total_Lifetime_Giving for the filtered range
ggplot(df_filtered, aes(x = Total_Lifetime_Giving)) +
geom_boxplot() +
ggtitle("Price Range: $200000 to $5000000") +
theme(plot.title = element_text(hjust = 0.5),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank())

#Create a R2 frame:

library(dplyr)
library(ggplot2)

# Test-set R-squared for each model, taken from the results printed by the
# sections above.
# Fix: the original hard-coded values (e.g. Random Forest 0.8424, SVM
# 0.70267, Lasso 0.128) that contradicted the outputs this same script
# produced; the values below are the ones actually printed earlier.
rsq_df <- data.frame(
  Model = c("Linear Regression", "Ridge Regression", "Lasso Regression",
            "Decision Tree", "Random Forest", "SVM"),
  R_Squared = c(0.1075281, 0.0551638, 0.0591800,
                0.3581774, 0.3692063, -0.0474219)
)

# Order models from best to worst R-squared and freeze that order as
# factor levels so ggplot keeps it on the x axis
rsq_df <- rsq_df %>%
  arrange(desc(R_Squared)) %>%
  mutate(Model = factor(Model, levels = Model))

# Bar plot comparing the models' R-squared values
ggplot(rsq_df, aes(x = Model, y = R_Squared, width = 0.3)) +
  geom_bar(stat = "identity", fill = "darkblue") +
  labs(x = "Model", y = "R-Squared", title = "Comparison of R-Squared Values") +
  theme(axis.text.x = element_text(angle = 0, vjust = 0.2, hjust = 0.5))